import os
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import feather
import plotly.express as px
import psutil
import libpysal as lp
import esda
import rasterio as rio
import contextily as ctx
import shapely.geometry as geom
import mapclassify as mc
%matplotlib inline
# move from the notebook folder up one level and into the repo's data directory
os.chdir('..')
os.chdir('data')
os.getcwd()  # echo the working directory as a sanity check (notebook cell output)
In this notebook, we will conduct some exploratory spatial data analysis on the afgifte dataset. Unlike the previous general ESDA (see afgifte_esda), we will explore three dependent variables: log(kg) of secondary resources received in each postcode for the agriculture, manufacturing, and construction industry, which is the SBI codes A, C, and F respectively. In this notebook, we will do:
Here we prepare a geodataframe for the ESDA. The dataframe should contain the following columns:
*I only included the sbi sections codes (A, C, F), and not the chapter codes (01, 02, 03). This is because, if the chapter codes were also included, it would lead to a double count when counting rows by sbi sections per postcode. This would lead to misleading results in plotting the distribution as well as mapping of data.
# clean df
# load the matched LMA waste flows (2019-2020) and keep only the columns used below
df = feather.read_dataframe('lma/af_2019-2020_matched-codes.feather')
df = df[['eaNaam','eaPostcode', 'gnc', 'ewc', 'vmc', 'sbi', 'kg']]
# df.sbi = df.sbi.replace('nan', '0')
# df = df.replace([None, 'nan', '0'], '--')
# change codes to chapters
# truncate classification codes to chapter level and postcodes to 4 digits (pc4)
df.gnc = df.gnc.str[:2]
df.ewc = df.ewc.str[:2]
df.vmc = df.vmc.str[:1]
df.eaPostcode = df.eaPostcode.str[:4]
# material column
def mat(row):
    """Return a material label for a flow row.

    Uses the GNC chapter ('GNC-<gnc>') when present, falling back to the
    EWC chapter ('EWC-<ewc>') when gnc carries the missing marker '--'.
    """
    prefix, code = ('EWC-', row.ewc) if row.gnc == '--' else ('GNC-', row.gnc)
    return prefix + code
df['mat'] = df.apply(lambda row: mat(row), axis=1)
# SBI CHAPTERS AND SECTIONS
# a row can carry several comma-separated SBI codes; split them into a list
df.sbi = df.sbi.str.split(',')
def chap(sbis):
    """Reduce a list of SBI codes to their unique 2-digit chapter prefixes."""
    return list({code[:2] for code in sbis})
df.sbi = df.sbi.map(lambda x: chap(x))  # collapse codes to their unique chapters
df['sbiLen'] = df.sbi.map(lambda x: len(x))  # number of distinct chapters per row
df.sbi = df.sbi.map(lambda x: ','.join(x))  # back to a comma-separated string
def sbilen(row):
    """Collapse rows matched to two or more SBI chapters into a 'mul' category."""
    if row.sbiLen >= 2:
        row.sbi = 'mul'
    return row
# rows with 2+ chapters become 'mul'; then drop the helper columns
df = df.apply(lambda row: sbilen(row), axis=1)
df = df[['eaPostcode', 'mat', 'vmc', 'sbi', 'kg']]
# assign sbi chapters (01, 02, 03, ...) to sbi sections (A, C, F, ...)
sbi = feather.read_dataframe('classification/sbi_Headings.feather')
secMake = ['A', 'C', 'F'] # A (agriculture), C (manufacturing), and F (construction)
sbiMake = sbi[sbi.section.isin(secMake)]
# NOTE(review): the -len(secMake) assumes the headings table contains one row per section letter — confirm
print('number of chapters included as making industry: {}'.format(len(sbiMake) - len(secMake)))
print(sbiMake.sbi.unique())
print('')
# merge LMA flows with sbi section headings
# aggregate kg per (postcode, sbi) before joining the section headings
df = df.groupby(['eaPostcode', 'sbi']).sum().reset_index().sort_values('kg', ascending=False)
df = pd.merge(df, sbi, how='left', on='sbi')
df.section = df.section.replace(np.NaN, '--')  # unmatched sbi codes get the '--' marker
def na(row):
    """Backfill a missing SBI section ('--') with the row's raw sbi value."""
    row.section = row.sbi if row.section == '--' else row.section
    return row
df = df.apply(lambda row: na(row), axis=1)
# re-aggregate kg at the (postcode, section) level
df = df.groupby(['eaPostcode', 'section']).sum().reset_index().sort_values('kg', ascending=False)
# merge with postcode-4 shape file of nl
# read pc4 shpfile
pc4 = gpd.read_file('spatial-data/nl_pc4.shp')
pc4 = pc4[['PC4', 'geometry']]
# merge with pc4 (left join keeps flow rows even when a postcode has no polygon)
df = pd.merge(df, pc4, how='left', left_on='eaPostcode', right_on='PC4')
df = gpd.GeoDataFrame(df)
# CALCULATE LOG VALUES
def log(x, logType):
    """Log-transform a weight, mapping non-positive x to 0 (avoids -inf/NaN).

    logType selects the base: 'logn' -> natural log, 'log10' -> base 10.
    The explicit x <= 0 guard fixes the original's dangling else, under which
    a non-positive x on the 'logn' path fell through and returned None.
    """
    if x <= 0:
        return 0
    if logType == 'logn':
        return np.log(x)
    if logType == 'log10':
        return np.log10(x)
    return 0  # unknown logType: keep the original's 0 fallback
df['logn'] = df.kg.map(lambda x: log(x, logType='logn'))
df['log10'] = df.kg.map(lambda x: log(x, logType='log10'))
# display
# share of total weight that could not be assigned to a single known industry
mul = df[df.section == 'mul'].kg.sum()
unk = df[df.section == '--'].kg.sum()
tot = df.kg.sum()
print('% kg with multiple sbis: {}\n% kg with unknown sbi(s): {}\n% kg with unknown or multiple sbis: {}'.format(
round(mul/tot*100,2), round(unk/tot*100,2), round((mul+unk)/tot*100,2)))
print(df.shape)
df.head()
The number of unmatched rows for sbi codes is too high - almost half of the waste (by weight) could not be assigned to an industry. This might explain the strange results later on in the notebook, such as the manufacturing eerste afnemers being so sparse and seemingly randomly distributed across the Netherlands (and not clustered in the major industrial areas in Rotterdam and Amsterdam).
The SBI code matching process needs to be vastly improved before we can do any meaningful analysis on the flows from an 'industry' perspective. Otherwise, if we are unable to improve the matching performance, we should try using a 'materials' perspective instead. Speaking of the 'industry' versus 'materials' perspective, more consideration is needed to pick one perspective (and subsequently categorization method).
# separate dfs for each industry type - A agriculture, C manufacturing, F construction, and all maker industries (combine A, C, F)
industryFlows = []
# for each type of industry flow, make gdf with all postcodes, where column nlog = log(kg) received by industry v
for i, v in enumerate(['A', 'C', 'F', ['A', 'C', 'F']]):
# make gdf
temp = df[df.section.isin(list(v))][['PC4', 'section', 'kg', 'logn', 'log10']]
temp = pd.merge(pc4, temp, how='left', on='PC4') # ensure that lma data has same shape of pc4 shpfile
temp.section = temp.section.fillna('--')
temp.kg = temp.kg.fillna(0)
temp.logn = temp.logn.fillna(0)
# NOTE(review): log10 is not filled here, so postcodes absent from temp keep NaN in log10 — confirm intended
# add temp gdf to list of industryFlows
industryFlows.append(temp)
# attribute gdfs to variable
agri = industryFlows[0]
manu = industryFlows[1]
cons = industryFlows[2]
allin = industryFlows[3]
# adjust allin gdf to aggregate duplicate rows caused by multiple sbi sections in the same postcode
allin = allin.dissolve(by='PC4', aggfunc='sum').reset_index()[['PC4', 'kg', 'geometry']]
def log(x, logType):
    """Log-transform a weight, mapping non-positive x to 0 (avoids -inf/NaN).

    logType selects the base: 'logn' -> natural log, 'log10' -> base 10.
    The explicit x <= 0 guard fixes the original's dangling else, under which
    a non-positive x on the 'logn' path fell through and returned None.
    """
    if x <= 0:
        return 0
    if logType == 'logn':
        return np.log(x)
    if logType == 'log10':
        return np.log10(x)
    return 0  # unknown logType: keep the original's 0 fallback
allin['logn'] = allin.kg.map(lambda x: log(x, logType='logn'))
allin['log10'] = allin.kg.map(lambda x: log(x, logType='log10'))
# add dfs to industryFlows
# rebuild the list so it holds the dissolved (aggregated) allin, not the pre-dissolve one
industryFlows = [agri, manu, cons, allin]
Here we take a first glance of the data by creating histograms. The three figures below show the distribution of values (kg received) throughout the dataset for each type of industry flow. This step doesn't say anything about the spatial relationships - we will explore in the rest of this notebook.
# histogram of the three industries
# 3 rows (raw kg, ln(kg), log10(kg)) x 4 columns (one per industry flow)
fig, ax = plt.subplots(3,4, figsize=(8*3,4*3))
# original values (not log transformations)
title = ['agriculture', 'manufacturing', 'construction', 'all production-related']
for i, d in enumerate(industryFlows):
d = d[d.kg != 0] # remove zero rows
sn.histplot(data=d, x="kg", bins=30, ax=ax[0,i])
ax[0,i].set_title('# postcodes for {} industry'.format(title[i]))
# nlog distributions
for i, d in enumerate(industryFlows):
d = d[d.kg != 0] # remove zero rows
sn.histplot(data=d, x="logn", bins=30, ax=ax[1,i])
# base10 log distributions
for i, d in enumerate(industryFlows):
d = d[d.kg != 0] # remove zero rows
sn.histplot(data=d, x="log10", bins=30, ax=ax[2,i])
plt.show()
Like the total kg received, the kg received per industry also follows a log distribution, meaning that there are lots of postcodes receiving a low kg of waste, and a few postcodes receiving high amounts of waste. However, after log-transformation, the manufacturing industry doesn't seem to follow a normal distribution very well. This could be because there aren't a lot of postcodes associated with the manufacturing industry in the first place. The construction and agriculture industries seem to follow a normal distribution pretty well.
I'm also trying to understand whether to choose a log transformation of base 10 or base e (natural log). Both log transformation result in the exact same shape, but the scale on the x-axis is different. I'm still trying to understand why that is the case.
Here we map out the kg and log(kg) of secondary resources received for each industry type. Some observations of the maps:
fig, ax = plt.subplots(1,4, figsize=(8*4,7))
# original values (without log transformation)
titles = ['agri-industry receivers', 'manufacturing industry receivers', 'construction industry receivers', 'all industry receivers']
# natural log values
# choropleth of log(kg) per postcode on a lightgrey basemap of all postcodes
for i, v in enumerate(['A', 'C', 'F', ['A', 'C', 'F']]):
temp = df[df.section.isin(list(v))]
pc4.plot(ax=ax[i], color='lightgrey')
temp.plot(ax=ax[i], column='logn', cmap='OrRd', legend=True)
ax[i].axis('off')
ax[i].set_title(titles[i] + ' (log values)')
plt.show()
Another aspect of the data that we need to consider are the zero values. 60% of postcodes in the Netherlands don't receive any secondary resources - this is highlighted in red in the map below. That is a lot of datapoints that will skew the data - so we need to think about whether to include the postcodes with zero kg received in our analysis later. For now, I've kept the zero values, because I think it's important to take these postcodes into account.
fig, ax = plt.subplots(1,4, figsize=(8*4,8*2))
titles = ['agri-industry receivers', 'manufacturing industry receivers', 'construction industry receivers', 'all industry receivers']
# red basemap = all postcodes; grey overlay = postcodes that receive anything,
# so the red left showing marks the zero-value postcodes
for i, v in enumerate(['A', 'C', 'F', ['A', 'C', 'F']]):
temp = df[df.section.isin(list(v))]
pc4.plot(ax=ax[i], color='red')
temp.plot(ax=ax[i], color='lightgrey')
ax[i].axis('off')
ax[i].set_title('zero values for ' + titles[i])
percZero = round((len(pc4) - len(temp)) / len(pc4) * 100, 2)  # share of postcodes receiving nothing
print('% postcodes with zero values for {}: {}%'.format(titles[i], percZero))
print('')
plt.show()
Before conducting any spatial analysis, it's important to find out whether the data has any relationship to space - does the data follow any kind of spatial pattern, or are the values just randomly distributed through space? Although we can answer this question from just looking at the map and trying to detect patterns with our eyes, we can also use spatial statistics to get a more accurate understanding of the data.
In order to do this, we look at the dataset and ask the question: 'If we threw values (kg) randomly on the map in simulations, how likely would these simulations resemble our actual data?' or in other words, 'what is the probability that our data (on kg received per pc) is randomly distributed in space?'
If we find that our data is not randomly distributed in space, we can assume that our data is somehow affected by geographical space, and proceed with our spatial analysis. However, if we find that our data is randomly distributed in space, there is no point in conducting further spatial analysis. In this case, we will have to further separate the data (by material type, for example) until we get a dataset that isn't randomly distributed in space.
Before we answer the questions above, we first need to find out for each postcode who its neighbors are, and define what we mean by 'neighbors'. For our analysis, we define neighbors as any postcodes that share an edge or a vertex (point). This definition is called a 'queen spatial weight', in reference to the queen chess piece.
Then, we need to describe, for each postcode, the conditions of its neighborhood. We do this by using a process of calculation called spatial lag. Here, for each postcode, we identify its neighbors, then take an average of all their values. This average neighbors' value is then attributed to the postcode. You can see this process in the maps below - the map on the left shows the actual kg received for each post code. The map on the right shows the average kg received of each postcode by its neighbors (and not itself).
This allows us to understand whether the postcodes in our data have a relationship to their spatial neighbors. For example, do postcodes with high values tend to have neighbors with high values as well? Or is it just random, where some high-value postcodes have high-value neighbors, while others don't?
# calculate spatial weights of NL (they are the same for all industry flows, since they are all in NL)
wq = lp.weights.Queen.from_dataframe(pc4) # queen spatial weights - for each pc, identify neighbors (other pcs that share a vertex or edge)
wq.transform = 'r' # weights get row normalized, so when you sum the row, it equals 1
# calculate spatial lag
for i in industryFlows:
# describe neighborhood condition for each PC
y = i['kg']
ylag = lp.weights.lag_spatial(wq, y) # spatial lag - using queen weights (wq), calculate spatial lag (average value of neighborhood) for log_kg (y)
# NOTE(review): the lag is computed on raw kg and log-taken afterwards, so
# 'lag_logn' holds log(lag(kg)), not lag(log(kg)) — confirm this is intended
i['lag_logn'] = ylag
def log(x):
if x > 0:
return np.log(x)
else:
return 0
i.lag_logn = i.lag_logn.map(lambda x: log(x))
As seen above, the pc4 shpfile has two islands. These postcodes don't have direct neighbors (aka don't share an edge or vertex with another postcode). This will pose problems further down the notebook - calculating local moran's I requires all elements to have at least one neighbor. For now, I solve the problem by just dropping these two postcodes just before I do the Local Moran's I calculations, but there are better solutions:
# plot normal values and spatial lag values
fig, ax = plt.subplots(2,4, figsize=(8*4,7*2))
titles = ['agri', 'manufacturing', 'construction', 'all']
# top row: each postcode's own log(kg)
for i,v in enumerate(industryFlows):
v.plot(column='logn', ax=ax[0][i], legend=True, cmap='OrRd')
ax[0][i].axis('off')
ax[0][i].set_title(titles[i] + ' industry log(kg) received')
# bottom row: the neighborhood average (spatial lag) of the same quantity
for i,v in enumerate(industryFlows):
v.plot(column='lag_logn', ax=ax[1][i], legend=True, cmap='OrRd')
ax[1][i].axis('off')
ax[1][i].set_title(titles[i] + ' industry log(kg) received (spatial lag)')
plt.show()
Although these p-values in the previous paragraph are impressive, it isn't accurate because we've simplified the data into binary values, losing a lot of information in the process. In order to get a more accurate understanding, we will do the same process on our dataset with continuous values. However, with continuous values, we cannot use join counts to analyse the data, because there are an infinite number of join types between neighbors. So instead, we use another method to measure spatial clustering - Moran's I. (Here's a helpful video explaining it)
Moran's I ranges from -1 to 1. 1 means that the data is perfectly clustered; 0 means perfectly random; and -1 means perfectly dispersed.
from IPython.display import Image
print('Moran\'s I')
Image("../images/morans_i.png", width=600)  # reference figure for the Moran's I formula
# calculate moran's I for log(kg) for 4 types of industry flows
# make dictionary of moran's I, keyed by industry name
mis = dict({
    'agri': None,
    'manu': None,
    'cons': None,
    'all': None
})
# calculate moran's I
titles = ['agri', 'manu', 'cons', 'all']
for n, i in enumerate([agri, manu, cons, allin]):
    y = i['logn']
    np.random.seed(12345)  # fixed seed so the permutation inference is reproducible
    mi = esda.moran.Moran(y, wq)
    # BUG FIX: store the full Moran object (not [mi, round(mi.I, 2)]) and do NOT
    # overwrite `mis` with a DataFrame afterwards — the original converted `mis`
    # to a DataFrame and dropped column 0, so the next cell's mis[v].sim/.I/.EI/.p_sim
    # lookups failed. Keeping the dict intact fixes that downstream cell.
    mis[titles[n]] = mi
    print('Moran\'s I for {} industry flows: {}'.format(titles[n], round(mi.I, 2)))
# histplot of Moran's I values
print('')
# build a throwaway frame for the histogram instead of clobbering `mis`
misPlot = pd.DataFrame({'morans_I': [round(m.I, 2) for m in mis.values()]}, index=list(mis.keys()))
sn.histplot(data=misPlot, x='morans_I', bins=10)
plt.show()
The Moran's Is for the industry flows range between 0.03-0.24, which seems quite low. However, Moran's I cannot be interpreted at face value, because it is an inferential statistic (at least according to this website). We can only interpret this number by comparing it to the moran's I of random simulations, similar to the previous process with the binary data. In the figure below, the blue curve shows the distribution of Moran's I values in 999 simulations, the black line shows the average Moran's I value of the simulations, and the red line shows the Moran's I of our actual data.
In the figures below, we can see that the log(kg) values have a statistically significant spatial clustering (p-value = 0.001), while the spatial clustering of the normal (kg) values are not statistically significant (p-value = 0.088 > 0.05). This means that it would be worth conducting spatial analysis on the log(kg) values, but not on the normal kg values. We should also explore further the spatial clustering of different material types to see if those are statistically significant as well.
# NOTE(review): this cell requires mis[v] to be an esda Moran object exposing
# .sim, .I, .EI and .p_sim, with keys being the industry names — verify against
# what the preceding cell actually leaves in `mis`
fig, ax = plt.subplots(1,4, figsize=(8*4, 6))
for i,v in enumerate(list(mis.keys())):
mi = mis[v]
sn.kdeplot(mi.sim, shade=True, ax=ax[i]) # kde plot of moran's I for 999 simulations
ax[i].vlines(mi.I, 0, 40, color='r')  # red line: observed Moran's I
ax[i].vlines(mi.EI, 0, 40)  # black line: expected I under spatial randomness
ax[i].set_xlabel("Moran's I")
ax[i].set_title("simulated vs real Morans I for {} industry".format(v))
print('p-value for Morans I of {} industry: {}'.format(v, mi.p_sim))
print('')
plt.show()
Although all p-values are lower than 0.05, there is a chance that the spatial distribution of the manufacturing industry receivers is random - the red line overlaps the blue curve in the second graph. This means that it might not be worth analyzing the manufacturing industry flows, because there isn't a strong spatial pattern here. Again, the next step is either to refine the sbi code matching method to allow more flows to match with manufacturing sbi codes; or to categorize the flows using materials rather than industry type.
The previous analysis explained how spatially clustered the data is as a whole, or its 'global' spatial auto-correlation. Now, we will try to understand the local neighborhood conditions of each postcode, and identifying postcodes that are outliers. To get a better understanding of the data, we created a scatterplot that shows, for each postcode, the relationship for kg received and average kg received by its neighbors. The figure on the right shows a scatterplot of log(kg) values.
The scatterplot can be understood by separating it into four quadrants (the black dotted lines):
np.random.seed(12345)
fig, ax = plt.subplots(1,4, figsize=(8*4, 8))
titles = ['agri', 'manu', 'cons', 'all']
for i,v in enumerate([agri, manu, cons, allin]):
# remove zero values
v = v[v.logn != 0]
# PREPARE VALUES
# create values for moran scatterplot of normal values
kg = v['logn']
lag_kg = v['lag_logn'] # average kg_received of neighbors
# polynomial of degree 1, aka draw a straight line that best fits the datapoints, where b, a is (y = bx + a)
b, a = np.polyfit(kg, lag_kg, 1)
# PLOT
# scatterplot of kg versus lagged kg
ax[i].plot(kg, lag_kg, '.', color='lightblue')
# red line of best fit using global I as slope
ax[i].plot(kg, b*kg+a, 'r')
ax[i].set_title('Moran Scatterplot for {} industry'.format(titles[i]))
ax[i].set_ylabel('Spatial Lag of log(kg)')
ax[i].set_xlabel('log(kg)')
# dashed lines at mean of the kg and lagged kg
# the two dashed mean-lines split the plot into the four Moran quadrants
ax[i].vlines(kg.mean(), lag_kg.min(), lag_kg.max(), linestyle='--')
ax[i].hlines(lag_kg.mean(), kg.min(), kg.max(), linestyle='--')
plt.show()
There seems to be barely any correlation between the amount of waste received by each postcode, and the amount received by its neighbors. This suggests that the spatial pattern is weak, especially for the manufacturing industry. If there was a strong spatial pattern, there should be more correlation between how much a pc receives and how much is received by its neighbor. And a strong spatial pattern would mean that kg received by a postcode has something to do with its location.
# plot islands
fig, ax = plt.subplots(1,1,figsize=(5,5))
pc4.plot(ax=ax, color='lightgrey')
pc4.iloc[wq.islands].plot(ax=ax, color='red')  # wq.islands: observations with no neighbors
ax.set_title('Islands ({} in total)'.format(len(wq.islands)))
ax.axis('off')
plt.show()
fixing disconnected components: https://geographicdata.science/book/notebooks/04_spatial_weights.html
# make spatial weight matrix with no islands
pcIslands = list(pc4.iloc[wq.islands].PC4) # identify island pcs
pc4ni = pc4[~pc4.PC4.isin(pcIslands)]
wq = lp.weights.Queen.from_dataframe(pc4ni)  # rebuild queen weights without the islands
wq.transform = 'r'  # row-normalize again
# calculate local moran's I
for i, d in enumerate([agri, manu, cons, allin]):
d = d[~d.PC4.isin(pcIslands)] # drop rows with islands
y = d['logn']
li = esda.moran.Moran_Local(y, wq)
n_sig = (li.p_sim < 0.05).sum()
# print stats
titles = ['agri', 'manu', 'cons', 'all']
# NOTE(review): the last '%' below divides by len(df), not len(d) — looks like it should be len(d); confirm
print(''' {}
Number of simulations for local morans I values: {}
p-value of local morans I for each pc: {}
% of pcs with a statistically significant value: {}
'''.format(titles[i], len(li.sim), li.p_sim, round((li.p_sim < 0.05).sum()/len(df) * 100, 2)))
# create values for hotspots, coldspots, diamonds, and doughnuts
# NB: `sig * li.q==1` parses as `(sig * li.q) == 1` — the multiply zeroes out
# non-significant pcs, then the comparison selects the quadrant
sig = li.p_sim < 0.05 # array showing whether each pc has p-value < 0.05
hotspot = sig * li.q==1 # array showing whether each pc is in q1 AND has p-value < 0.05
coldspot = sig * li.q==3
doughnut = sig * li.q==2
diamond = sig * li.q==4
# plot hotspots, coldspots, diamonds, and doughnuts
f,ax = plt.subplots(1,4,figsize=(6*4,8), subplot_kw=dict(aspect='equal'))
def plotSpots(spotType, color, i, title):
# note: relies on `d` from the enclosing scope (whatever the loop last left it as)
spots = ['n.sig.', title]
labels = [spots[i] for i in spotType*1] # why is it hotspot*1 and not just hotspot? -> *1 casts the boolean array to 0/1 ints for list indexing
from matplotlib import colors
hmap = colors.ListedColormap([color, 'lightgrey'])
d.assign(cl=labels).plot(column='cl', categorical=True,
k=2, cmap=hmap, linewidth=0.1, ax=ax[i], # how does it know what order to put the colors?
edgecolor='white', legend=True, legend_kwds={'loc': 'lower left', 'bbox_to_anchor':(0.,0.,0.,0.)})
ax[i].set_axis_off()
ax[i].set_title(title)
plotSpots(spotType=hotspot, color='red', i=0, title='hotspots')
plotSpots(spotType=coldspot, color='blue', i=1, title='coldspots')
plotSpots(spotType=doughnut, color='blue', i=2, title='doughnuts')
plotSpots(spotType=diamond, color='red', i=3, title='diamonds')
plt.show()
# re-run of the local Moran's I stats for the agri flow only
for i, d in enumerate([agri]):
d = d[~d.PC4.isin(pcIslands)] # drop rows with islands
y = d['logn']
li = esda.moran.Moran_Local(y, wq)
n_sig = (li.p_sim < 0.05).sum()
# print stats
titles = ['agri', 'manu', 'cons', 'all']
# NOTE(review): the last '%' below divides by len(df), not len(d) — confirm
print(''' {}
Number of simulations for local morans I values: {}
p-value of local morans I for each pc: {}
% of pcs with a statistically significant value: {}
'''.format(titles[i], len(li.sim), li.p_sim, round((li.p_sim < 0.05).sum()/len(df) * 100, 2)))
# create values for hotspots, coldspots, diamonds, and doughnuts
sig = li.p_sim < 0.05 # array showing whether each pc has p-value < 0.05
hotspot = sig * li.q==1 # array showing whether each pc is in q1 AND has p-value < 0.05
coldspot = sig * li.q==3
doughnut = sig * li.q==2
diamond = sig * li.q==4
# combined cluster maps: each pc encoded 0 (ns), 1 hot, 2 doughnut, 3 cold, 4 diamond
from matplotlib import colors
hmap = colors.ListedColormap(['lightgrey', 'red', 'lightblue', 'blue', 'pink'])
titles = ['agri', 'manu', 'cons', 'all']
f, ax = plt.subplots(1,4, figsize=(6*4,8))
for i, d in enumerate([agri, manu, cons, allin]):
# calculate local Moran's I for each df
d = d[~d.PC4.isin(pcIslands)] # drop rows with islands
y = d['logn']
li = esda.moran.Moran_Local(y, wq)
n_sig = (li.p_sim < 0.05).sum()
# values of hotspots, coldspots...etc.
sig = 1 * (li.p_sim < 0.05)
hotspot = 1 * (sig * li.q==1)
coldspot = 3 * (sig * li.q==3)
doughnut = 2 * (sig * li.q==2)
diamond = 4 * (sig * li.q==4)
# summing is safe: each pc falls in exactly one quadrant, so at most one term is non-zero
spots = hotspot + coldspot + doughnut + diamond
spot_labels = [ '0 ns', '1 hot spot', '2 doughnut', '3 cold spot', '4 diamond']
labels = [spot_labels[i] for i in spots]
# plot
d.assign(cl=labels).plot(column='cl', categorical=True, \
k=2, cmap=hmap, linewidth=0.1, ax=ax[i], \
edgecolor='white', legend=True, legend_kwds={'loc': 'lower left', 'bbox_to_anchor':(0.,0.,0.,0.)})
ax[i].set_axis_off()
ax[i].set_title(titles[i])
plt.show()
Would be good to verify these hotspots on google maps. For example, if we see that a hotspot of the agriculture industry is right in the middle of an industrial area, we know that there is probably something wrong with the data. I'm asking this question because the hotspots for manufacturing industry receivers don't seem to be in major industrial areas.
agri
# agri-only cluster map, mirroring the pattern of the combined cell above
for i, d in enumerate([agri]):
    # calculate local Moran's I for each df
    d = d[~d.PC4.isin(pcIslands)] # drop rows with islands
    # BUG FIX: the original read y from df['kg'] and plotted on `df` — `df` still
    # contains the island postcodes, so its length does not match the island-free
    # weights `wq`, and raw kg was used instead of log(kg). Use the filtered `d`
    # and the 'logn' column, consistent with every other local Moran cell.
    y = d['logn']
    li = esda.moran.Moran_Local(y, wq)
    n_sig = (li.p_sim < 0.05).sum()
    # values of hotspots, coldspots...etc. (each pc: 0 ns, 1 hot, 2 doughnut, 3 cold, 4 diamond)
    sig = 1 * (li.p_sim < 0.05)
    hotspot = 1 * (sig * li.q==1)
    coldspot = 3 * (sig * li.q==3)
    doughnut = 2 * (sig * li.q==2)
    diamond = 4 * (sig * li.q==4)
    spots = hotspot + coldspot + doughnut + diamond
    spot_labels = [ '0 ns', '1 hot spot', '2 doughnut', '3 cold spot', '4 diamond']
    labels = [spot_labels[s] for s in spots]  # use a fresh name so the loop index i is untouched
    # plot (note: reuses `hmap`, `titles` and `ax` from the previous cell)
    d.assign(cl=labels).plot(column='cl', categorical=True, \
        k=2, cmap=hmap, linewidth=0.1, ax=ax[i], \
        edgecolor='white', legend=True, legend_kwds={'loc': 'lower left', 'bbox_to_anchor':(0.,0.,0.,0.)})
    ax[i].set_axis_off()
    ax[i].set_title(titles[i])